1. Data Processing¶
1.1 Tweets¶
import pandas as pd
#Load file
file_path = "stock_tweets.csv"
data = pd.read_csv(file_path)
data.head(10)
| Date | Tweet | Stock Name | Company Name | |
|---|---|---|---|---|
| 0 | 2022-09-29 23:41:16+00:00 | Mainstream media has done an amazing job at br... | TSLA | Tesla, Inc. |
| 1 | 2022-09-29 23:24:43+00:00 | Tesla delivery estimates are at around 364k fr... | TSLA | Tesla, Inc. |
| 2 | 2022-09-29 23:18:08+00:00 | 3/ Even if I include 63.0M unvested RSUs as of... | TSLA | Tesla, Inc. |
| 3 | 2022-09-29 22:40:07+00:00 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | TSLA | Tesla, Inc. |
| 4 | 2022-09-29 22:27:05+00:00 | @RealDanODowd @Tesla Stop trying to kill kids,... | TSLA | Tesla, Inc. |
| 5 | 2022-09-29 22:25:53+00:00 | @RealDanODowd @Tesla This is you https://t.co/... | TSLA | Tesla, Inc. |
| 6 | 2022-09-29 22:24:22+00:00 | For years @WholeMarsBlog viciously silenced @T... | TSLA | Tesla, Inc. |
| 7 | 2022-09-29 22:23:54+00:00 | $NIO just because I'm down money doesn't mean ... | TSLA | Tesla, Inc. |
| 8 | 2022-09-29 22:23:28+00:00 | 50 likes for some $SPY $TSLA charts to study!\... | TSLA | Tesla, Inc. |
| 9 | 2022-09-29 22:15:01+00:00 | @MrJames__321 @KellyRoofing @TeslaSolar @elonm... | TSLA | Tesla, Inc. |
#Check for shape
data.shape
(80793, 4)
#Check for columns
column_names = data.columns.tolist()
print("Column names in the dataset:")
print(column_names)
Column names in the dataset: ['Date', 'Tweet', 'Stock Name', 'Company Name']
#Transform date time
data['Date'] = pd.to_datetime(data['Date'])
data['Date'] = data['Date'].dt.date
data.head()
| Date | Tweet | Stock Name | Company Name | |
|---|---|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... | TSLA | Tesla, Inc. |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... | TSLA | Tesla, Inc. |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... | TSLA | Tesla, Inc. |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... | TSLA | Tesla, Inc. |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... | TSLA | Tesla, Inc. |
#Filter out tweets of Tesla
tsla_data = data[data['Stock Name'] == 'TSLA']
#Drop out useless columns
tsla_data = tsla_data.drop(columns=['Company Name'])
tsla_data = tsla_data.drop(columns=['Stock Name'])
tsla_data.head()
| Date | Tweet | |
|---|---|---|
| 0 | 2022-09-29 | Mainstream media has done an amazing job at br... |
| 1 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... |
| 2 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... |
| 3 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... |
| 4 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... |
#Reverse the dataset
tsla_data = tsla_data.iloc[::-1]
tsla_data = tsla_data.reset_index(drop=True)
tsla_data.head()
| Date | Tweet | |
|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... |
#Processed dataset
tsla_data.shape
(37422, 2)
#Check for date
start_date = tsla_data['Date'].min()
end_date = tsla_data['Date'].max()
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
Start Date: 2021-09-30 End Date: 2022-09-29
import matplotlib.pyplot as plt
#Visualize daily tweet count
daily_tweet_counts = tsla_data.groupby('Date').size().reset_index(name='Tweet Count')
print("Daily Tweet Counts:")
print(daily_tweet_counts)
#Visualize
plt.figure(figsize=(10, 6))
plt.plot(daily_tweet_counts['Date'], daily_tweet_counts['Tweet Count'], linestyle='-')
plt.title("Daily Tweet Counts")
plt.xlabel("Date")
plt.ylabel("Number of Tweets")
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Daily Tweet Counts:
Date Tweet Count
0 2021-09-30 90
1 2021-10-01 94
2 2021-10-02 116
3 2021-10-03 61
4 2021-10-04 119
.. ... ...
360 2022-09-25 36
361 2022-09-26 72
362 2022-09-27 85
363 2022-09-28 75
364 2022-09-29 112
[365 rows x 2 columns]
#Save file
new_file_path = "TSLA_Tweets_data.csv"
tsla_data.to_csv(new_file_path, index=False)
#Save daily tweet count as csv
daily_tweet_counts = pd.DataFrame(daily_tweet_counts)
daily_tweet_counts.to_csv("daily_tweet_counts.csv", index = False)
1.2 Stock¶
#Load file for stock
file_path = "stock_yfinance_data.csv"
data = pd.read_csv(file_path)
data.head(5)
| Date | Open | High | Low | Close | Adj Close | Volume | Stock Name | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 | TSLA |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 | TSLA |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 | TSLA |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 | TSLA |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 | TSLA |
#Check for shape
data.shape
(6300, 8)
#Check for columns
column_names = data.columns.tolist()
print("Column names in the dataset:")
print(column_names)
Column names in the dataset: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Stock Name']
#Check for all companies
data['Stock Name'].unique()
array(['TSLA', 'MSFT', 'PG', 'META', 'AMZN', 'GOOG', 'AMD', 'AAPL',
'NFLX', 'TSM', 'KO', 'F', 'COST', 'DIS', 'VZ', 'CRM', 'INTC', 'BA',
'BX', 'NOC', 'PYPL', 'ENPH', 'NIO', 'ZS', 'XPEV'], dtype=object)
#Count of each stock name in the dataset
stock_counts = data['Stock Name'].value_counts()
print("Count of each stock name in the dataset:")
print(stock_counts)
Count of each stock name in the dataset: Stock Name TSLA 252 DIS 252 ZS 252 NIO 252 ENPH 252 PYPL 252 NOC 252 BX 252 BA 252 INTC 252 CRM 252 VZ 252 COST 252 MSFT 252 F 252 KO 252 TSM 252 NFLX 252 AAPL 252 AMD 252 GOOG 252 AMZN 252 META 252 PG 252 XPEV 252 Name: count, dtype: int64
#Filter out TSLA stock
tsla_stock_data = data[data['Stock Name'] == 'TSLA']
#Transfer date time data
tsla_stock_data['Date'] = pd.to_datetime(tsla_stock_data['Date'])
tsla_stock_data.head(5)
C:\Users\LJT19\AppData\Local\Temp\ipykernel_10904\4277104264.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy tsla_stock_data['Date'] = pd.to_datetime(tsla_stock_data['Date'])
| Date | Open | High | Low | Close | Adj Close | Volume | Stock Name | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 | TSLA |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 | TSLA |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 | TSLA |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 | TSLA |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 | TSLA |
## Check for duplicates
duplicates = tsla_stock_data[tsla_stock_data.duplicated(subset='Date', keep=False)]
if not duplicates.empty:
print("Duplicates found:")
print(duplicates)
else:
print("No duplicates found.")
# Check for missing values
missing_values = tsla_stock_data.isnull().sum()
if missing_values.sum() > 0:
print("Missing values found:")
print(missing_values)
else:
print("No missing values found.")
No duplicates found. No missing values found.
#Check for date
start_date = tsla_stock_data['Date'].min()
end_date = tsla_stock_data['Date'].max()
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
Start Date: 2021-09-30 00:00:00 End Date: 2022-09-29 00:00:00
#Visualization for Stock price over time
import matplotlib.dates as mdates
data_columns = {
'Open': {'color': 'blue', 'linestyle': '--'},
'Close': {'color': 'red', 'linestyle': '-'},
'High': {'color': 'green', 'linestyle': '-.', 'linewidth': 1.5},
'Low': {'color': 'orange', 'linestyle': ':', 'linewidth': 1.5}
}
plt.figure(figsize=(12, 8))
for column, style in data_columns.items():
plt.plot(tsla_stock_data['Date'], tsla_stock_data[column], label=column, **style)
plt.title("Stock Price of Tesla Over Time", fontsize=16, fontweight='bold')
plt.xlabel("Date", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
#Save
#Delete Stock Name column
tsla_stock_data = tsla_stock_data.drop(columns=['Stock Name'])
new_file_path = "TSLA_stock_data.csv"
tsla_stock_data.to_csv(new_file_path, index=False)
#Processed data info
tsla_data
| Date | Tweet | |
|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... |
| ... | ... | ... |
| 37417 | 2022-09-29 | @RealDanODowd @Tesla Stop trying to kill kids,... |
| 37418 | 2022-09-29 | @RealDanODowd @WholeMarsBlog @Tesla Hahaha why... |
| 37419 | 2022-09-29 | 3/ Even if I include 63.0M unvested RSUs as of... |
| 37420 | 2022-09-29 | Tesla delivery estimates are at around 364k fr... |
| 37421 | 2022-09-29 | Mainstream media has done an amazing job at br... |
37422 rows × 2 columns
1.3 Stock Index¶
import pandas as pd
# Load file
file_path = 'HistoricalData_1742894586653.csv'
data = pd.read_csv(file_path)
# Convert the date column to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
# Filter data within the specified date range
start_date = '2021-09-30'
end_date = '2022-09-29'
filtered_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)]
# Sort the filtered data by date in ascending order
filtered_data = filtered_data.sort_values(by='Date', ascending=True)
# Extract the closing price column and rename the column names
Nasdaq = filtered_data[['Date', 'Close/Open']]
Nasdaq.columns = ['Date', 'Nasdaq_Index'] # Rename column names
Nasdaq = Nasdaq.reset_index(drop=True)
# Print the result
print(Nasdaq)
Nasdaq.to_csv('Nasdaq_Index.csv', index=False)
Date Nasdaq_Index 0 2021-09-30 14689.62 1 2021-10-01 14791.87 2 2021-10-04 14472.12 3 2021-10-05 14674.15 4 2021-10-06 14766.75 .. ... ... 247 2022-09-23 11311.24 248 2022-09-26 11254.11 249 2022-09-27 11271.75 250 2022-09-28 11493.83 251 2022-09-29 11164.78 [252 rows x 2 columns]
# Load file of sp500 index
file_path = 'sp500_index.csv'
data = pd.read_csv(file_path)
data
| Date | S&P500 | Nasdaq_Index | |
|---|---|---|---|
| 0 | 2021-09-30 | 4307.54 | 14689.62 |
| 1 | 2021-10-01 | 4357.04 | 14791.87 |
| 2 | 2021-10-04 | 4300.46 | 14472.12 |
| 3 | 2021-10-05 | 4345.72 | 14674.15 |
| 4 | 2021-10-06 | 4363.55 | 14766.75 |
| ... | ... | ... | ... |
| 247 | 2022-09-23 | 3693.23 | 11311.24 |
| 248 | 2022-09-26 | 3655.04 | 11254.11 |
| 249 | 2022-09-27 | 3647.29 | 11271.75 |
| 250 | 2022-09-28 | 3719.04 | 11493.83 |
| 251 | 2022-09-29 | 3640.47 | 11164.78 |
252 rows × 3 columns
#filter out date from 2021.9.30 to 2022.9.29
filtered_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)]
# reset index
filtered_data = filtered_data.reset_index(drop=True)
# merge two index
filtered_data['Nasdaq_Index'] = Nasdaq['Nasdaq_Index']
filtered_data
| Date | S&P500 | Nasdaq_Index | |
|---|---|---|---|
| 0 | 2021-09-30 | 4307.54 | 14689.62 |
| 1 | 2021-10-01 | 4357.04 | 14791.87 |
| 2 | 2021-10-04 | 4300.46 | 14472.12 |
| 3 | 2021-10-05 | 4345.72 | 14674.15 |
| 4 | 2021-10-06 | 4363.55 | 14766.75 |
| ... | ... | ... | ... |
| 247 | 2022-09-23 | 3693.23 | 11311.24 |
| 248 | 2022-09-26 | 3655.04 | 11254.11 |
| 249 | 2022-09-27 | 3647.29 | 11271.75 |
| 250 | 2022-09-28 | 3719.04 | 11493.83 |
| 251 | 2022-09-29 | 3640.47 | 11164.78 |
252 rows × 3 columns
#Save file
filtered_data.to_csv('Index.csv', index=False)
# Visualization of two index over time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
filtered_data['Date'] = pd.to_datetime(filtered_data['Date'])
filtered_data = filtered_data.sort_values(by='Date')
plt.figure(figsize=(10, 6))
plt.plot(filtered_data['Date'], filtered_data['Nasdaq_Index'], linestyle='-')
plt.title("Nasdaq Index Over Time")
plt.xlabel("Date")
plt.ylabel("Nasdaq Index")
plt.grid(True)
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
plt.figure(figsize=(10, 6))
plt.plot(filtered_data['Date'], filtered_data['S&P500'], color='red', linestyle='-')
plt.title("S&P 500 Index Over Time")
plt.xlabel("Date")
plt.ylabel("S&P 500 Index")
plt.grid(True)
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
2. Sentiment Analysis¶
2.1 Generate sentiment score¶
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
tsla_data.head()
| Date | Tweet | |
|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... |
# Add sentiment columns
sent_df = tsla_data.copy()
sent_df["sentiment_score"] = ''
sent_df["Positive"] = ''
sent_df["Neutral"] = ''
sent_df["Negative"] = ''
sent_df.head()
| Date | Tweet | sentiment_score | Positive | Neutral | Negative | |
|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | ||||
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | ||||
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | ||||
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | ||||
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... |
%%time
# Generate Sentiment score by SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()
for indx, row in sent_df.T.items():
try:
sentence_i = unicodedata.normalize('NFKD', sent_df.loc[indx, 'Tweet'])
sentence_sentiment = sentiment_analyzer.polarity_scores(sentence_i)
sent_df.at[indx, 'sentiment_score'] = sentence_sentiment['compound']
sent_df.at[indx, 'Negative'] = sentence_sentiment['neg']
sent_df.at[indx, 'Neutral'] = sentence_sentiment['neu']
sent_df.at[indx, 'Positive'] = sentence_sentiment['pos']
except TypeError:
print (sent_df.loc[indexx, 'Tweet'])
print (indx)
break
CPU times: total: 8.52 s Wall time: 8.54 s
# Check for result
sent_df.head()
| Date | Tweet | sentiment_score | Positive | Neutral | Negative | |
|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | 0.659 | 0.166 | 0.834 | 0.0 |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | 0.4215 | 0.257 | 0.743 | 0.0 |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | 0.5719 | 0.175 | 0.747 | 0.078 |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | -0.1531 | 0.148 | 0.656 | 0.197 |
# Convert the 'Date' column to datetime format
sent_df['Date'] = pd.to_datetime(sent_df['Date'])
# Extract only the date part from the datetime column (remove time information)
sent_df['Date'] = sent_df['Date'].dt.date
#sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral', 'Stock Name', 'Company Name'])
sent_df.head()
| Date | Tweet | sentiment_score | Positive | Neutral | Negative | |
|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | 0.659 | 0.166 | 0.834 | 0.0 |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | 0.4215 | 0.257 | 0.743 | 0.0 |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | 0.5719 | 0.175 | 0.747 | 0.078 |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | -0.1531 | 0.148 | 0.656 | 0.197 |
# Save file
sent_df.to_csv('Tweet_sentiment_score.csv', encoding='utf-8',index=False)
sent_df.groupby('Date')['sentiment_score'].mean().head(20)
Date 2021-09-30 0.231552 2021-10-01 0.233704 2021-10-02 0.27194 2021-10-03 0.27157 2021-10-04 0.135388 2021-10-05 0.069445 2021-10-06 0.19994 2021-10-07 0.192548 2021-10-08 0.220011 2021-10-09 0.294931 2021-10-10 0.244551 2021-10-11 0.185286 2021-10-12 0.191255 2021-10-13 0.160097 2021-10-14 0.158425 2021-10-15 0.080318 2021-10-16 0.220176 2021-10-17 0.215528 2021-10-18 0.219143 2021-10-19 0.155566 Name: sentiment_score, dtype: object
# Calculate average of sentiment score
twitter_df = sent_df.groupby('Date')['sentiment_score'].mean()
print(twitter_df.shape)
# Save file
twitter_df.to_csv('Tweet_sentiment_score(every day).csv', encoding='utf-8',index=True)
(365,)
sent_df['Date'] = pd.to_datetime(sent_df['Date'])
##weekly_sentiment_trade = sent_df.groupby(pd.Grouper(key='Date', freq='W-FRI'))['sentiment_score'].mean() #freq='W-FRI'
weekly_sentiment_trade = sent_df.groupby(pd.Grouper(key='Date', freq='W'))['sentiment_score'].mean()
weekly_sentiment_trade.head()
# Average sentiment score on a weekly basis (up to Sunday).
#weekly_sentiment_trade.to_csv('Tweet_sentiment_score(every week).csv', encoding='utf-8',index=True)
Date 2021-10-03 0.251852 2021-10-10 0.188605 2021-10-17 0.166151 2021-10-24 0.184919 2021-10-31 0.196194 Freq: W-SUN, Name: sentiment_score, dtype: object
monthly_sentiment = sent_df.groupby(pd.Grouper(key='Date', freq='ME'))['sentiment_score'].mean()
monthly_sentiment.head()
#The average sentiment score on a monthly basis (up to the end of the month).
#monthly_sentiment.to_csv('Tweet_sentiment_score(every month).csv', encoding='utf-8',index=True)
Date 2021-09-30 0.231552 2021-10-31 0.191146 2021-11-30 0.191373 2021-12-31 0.188472 2022-01-31 0.14224 Freq: ME, Name: sentiment_score, dtype: object
#categorize sentiment scores into 'positive', 'negative', or 'neutral'
sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral'])
def categorize_sentiment(score):
if score >= 0.5:
return 'positive'
elif score <= -0.5:
return 'negative'
else:
return 'neutral'
sent_df['sentiment'] = sent_df['sentiment_score'].apply(categorize_sentiment)
sent_df.head()
| Date | Tweet | sentiment_score | sentiment | |
|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | 0.659 | positive |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | 0.4215 | neutral |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | 0.0 | neutral |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | 0.5719 | positive |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | -0.1531 | neutral |
# Categorize sentiment scores into numerical values:
def categorize_sentiment2(score):
if score >= 0.4:
return 1
elif score <= -0.4:
return 0
else:
return 0.5
sent_df['sentiment'] = sent_df['sentiment_score'].apply(categorize_sentiment2)
sent_df.head()
| Date | Tweet | sentiment_score | sentiment | |
|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | 0.659 | 1.0 |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | 0.4215 | 1.0 |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | 0.0 | 0.5 |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... | 0.5719 | 1.0 |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... | -0.1531 | 0.5 |
#sent_df = sent_df.drop(columns=['Negative', 'Positive', 'Neutral'])
#sent_df.head()
# Save the file (sentiment scores of all tweets)
#sent_df.to_csv('Tweet_sentiment_score_category.csv', encoding='utf-8',index=False)
2.2 LSTM for sentiment analysis¶
The following is modified from the tutorial content¶
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt') # Used for sentence tokenizer
nltk.download('stopwords')
nltk.download('punkt_tab')
import string
from nltk.tokenize import word_tokenize #nltk: natural language toolkit
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\LJT19\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\LJT19\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt_tab to [nltk_data] C:\Users\LJT19\AppData\Roaming\nltk_data... [nltk_data] Package punkt_tab is already up-to-date!
lstm_df = sent_df.copy()
# Preprocessing text data
review_lines = list()
lines = lstm_df['Tweet'].values.tolist()
for line in lines:
tokens = word_tokenize(line)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
review_lines.append(words)
# check the number of reviews and the data frame shape
len(review_lines)
lstm_df.shape
(37422, 4)
import gensim
EMBEDDING_DIM = 100
# Train Word2Vec model
model = gensim.models.Word2Vec(sentences=review_lines, vector_size=EMBEDDING_DIM, window=5, min_count=1)
# Get vocabulary size
words = list(model.wv.index_to_key) # Updated method for getting vocab
print('Vocabulary size: %d' % len(words))
Vocabulary size: 30515
print(model.wv['root'])
print(len(model.wv['root']))
[-0.02639537 0.04859987 0.02323065 0.01520963 0.02069867 -0.06345103 0.00237939 0.10403991 -0.02221519 -0.01791887 -0.00426699 -0.06221278 -0.00984189 0.01198509 0.02840637 -0.03173357 0.02984535 -0.02477416 -0.01590225 -0.06706116 0.01942845 0.01528488 0.01606948 -0.01145657 -0.00077515 -0.00854264 -0.00818149 -0.01306987 -0.03472029 0.00400812 0.0404654 0.00851348 0.01482215 -0.04922578 -0.01271665 0.05398022 0.03191518 -0.0242642 -0.0316348 -0.00399408 -0.00492199 -0.02592164 -0.01037458 0.00309789 0.03754494 0.00466243 -0.04174955 -0.02683674 0.03998186 0.01295525 0.02872487 -0.03530835 -0.04714225 -0.0012458 -0.02732489 -0.00957228 0.03153846 -0.004599 -0.02782652 0.00469119 -0.00240873 0.01199539 0.01057674 -0.03354356 -0.03389709 0.00507666 0.01660249 0.01945366 -0.04706703 0.01470122 0.00744584 0.03220437 0.04197431 0.00299182 0.02270749 0.02610776 0.00666063 -0.00461335 -0.02119539 0.00996126 -0.03195732 0.00079824 0.00183682 0.0245774 -0.00269652 0.00316148 0.00894271 0.02376296 0.04649055 0.0071784 0.0124121 0.01912157 0.01785881 -0.01374316 0.05322729 0.02678355 0.02244523 -0.03449194 -0.01159479 -0.00490714] 100
# save model in ASCII (word2vec) format
filename = 'group_project_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)
# Finding out similar words
model.wv.most_similar('tesla', topn =5)
model.wv.similar_by_word("car")
[('ever', 0.9052832722663879),
('miles', 0.9046816229820251),
('guzzling', 0.9017306566238403),
('teslas', 0.9010254144668579),
('safest', 0.8957082033157349),
('carriers', 0.8872512578964233),
('drive', 0.8776369094848633),
('world', 0.8759986758232117),
('fastest', 0.8705390691757202),
('autonomous', 0.8698485493659973)]
# Find words related to "positive" (to measure positive sentiment)
print(model.wv.most_similar('positive', topn=10))
# Find words related to "negative" (to measure negative sentiment)
print(model.wv.most_similar('negative', topn=10))
[('continue', 0.9677192568778992), ('fundamentals', 0.9598056077957153), ('fall', 0.9556040167808533), ('ridiculously', 0.9524328112602234), ('cause', 0.9514209032058716), ('raise', 0.9509127736091614), ('reflect', 0.9463772773742676), ('assuming', 0.9457159638404846), ('given', 0.9450112581253052), ('peaked', 0.9441218972206116)]
[('worry', 0.9699762463569641), ('portfolios', 0.969102144241333), ('strategy', 0.968085765838623), ('cover', 0.9671775698661804), ('cushion', 0.9660492539405823), ('pain', 0.9642733335494995), ('institutions', 0.9641199707984924), ('fear', 0.9635016918182373), ('fundamentally', 0.963267982006073), ('confidence', 0.9623303413391113)]
# Word that best satisfies analogy relations
# Example usage of analogy (commented out):
# model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# Analyze analogy relationships
# For example, "elon" is to "tesla" as "tim_cook" is to what?
# Here, we are trying to find a word that completes the analogy:
# "boss" is to "tesla" as "elon" is to what?
model.wv.most_similar(positive=['boss', 'tesla'], negative=['elon'], topn=5)
[('beautiful', 0.853935718536377),
('stargazerplaid', 0.8470975160598755),
('gigaworkshop', 0.8423700928688049),
('tesmaniancom', 0.8418003916740417),
('snowblower', 0.8417527079582214)]
# Finding the odd word out
print(model.wv.doesnt_match("tesla autopilot software battery recall".split()))
print(model.wv.doesnt_match("elonmusk ceo innovation problem success".split()))
tesla elonmusk
# Similarity between words
#print(model.wv.similarity('cat', 'movie'))
# Suppress excessive printing and load embeddings efficiently
embeddings_index = {}
with open('group_project_embedding_word2vec.txt', 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
values = line.split()
word = values[0] # First value is the word
coefs = np.asarray(values[1:], dtype='float32') # Remaining values are embeddings
embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")
Loaded 30516 word vectors.
#Let's check the vector for word "men"
embeddings_index['tesla']
array([-0.71105003, 0.87905407, -0.46093175, 0.2798589 , 0.4994845 ,
-1.2032409 , -0.347253 , 2.1046436 , 0.04646157, -0.64542085,
-0.7883267 , -0.55146027, -0.04696903, 0.62486345, -0.13276118,
-0.3724636 , 1.2867846 , 0.6873424 , -0.07962274, -1.2975892 ,
-0.09589551, -0.18267868, 0.44770238, -1.6075722 , 1.4333322 ,
-0.6432317 , -0.28130063, 0.16019958, -0.25701803, 0.18099813,
0.56565833, -0.21602117, -1.2137904 , -1.0170438 , -0.12016281,
1.2887949 , 0.18248892, -0.5955466 , 0.0161146 , -0.3073855 ,
0.29642892, 0.06306926, -0.51192147, -0.43064877, 0.6596848 ,
1.2730328 , -1.4054806 , -1.1332502 , 0.661447 , 0.3076126 ,
0.9193487 , -0.6246483 , -1.6771576 , 0.51781726, -0.02376858,
-0.58212614, 0.05695917, -0.00985799, -0.8893998 , -1.0885311 ,
0.23131801, 0.8802511 , 0.03330267, -0.66670716, -0.2246818 ,
0.29192406, 1.3226191 , -1.0850828 , -0.8439994 , 0.17294247,
-0.25068647, 1.3726964 , 0.51728827, 0.04200741, -0.01277055,
0.49540782, 0.49373797, -0.71228194, -0.9718147 , 1.1175257 ,
0.3502359 , -0.15345189, -0.33879098, 0.14311174, -0.15162936,
0.48832226, -1.1551956 , -0.74478936, 0.6889191 , -0.22727433,
0.52805185, 0.74544823, 0.60426563, -0.6128503 , 1.7455885 ,
0.39142498, 1.1795077 , -0.7301047 , 0.39732546, 0.08327533],
dtype=float32)
import torch
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
VALIDATION_SPLIT = 0.2
# Use NLTK tokenizer
tokenizer_obj = word_tokenize
# Tokenize the text and convert words to indices
vocab = {} # Dictionary to store word indices
word_index = {}
index = 1 # Start indexing from 1 (0 is reserved for padding)
# Build vocabulary from the dataset
tokenized_texts = []
for line in review_lines:
if isinstance(line, list):
line = ' '.join(line) # Convert list to string if necessary
tokens = tokenizer_obj(line) # Tokenize using NLTK
tokenized_texts.append(tokens)
for token in tokens:
if token not in vocab:
vocab[token] = index
word_index[token] = index
index += 1
# Convert tokens to numerical sequences
sequences = [[vocab[token] for token in tokens] for tokens in tokenized_texts]
# Padding sequences
sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
print("Vocabulary size:", len(vocab))
print("Sample tokenized sequence:", sequences[0])
Vocabulary size: 30515
Sample tokenized sequence: tensor([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
max_length = 100 # Maximum length of sentences
# Pad sequences
print('Found %s unique tokens.' % len(vocab))
# Convert to tensor and pad
review_pad = pad_sequence(sequences, batch_first=True, padding_value=0) # Padding with 0
review_pad = review_pad[:, :max_length] # Ensure max_length constraint
sentiment = torch.tensor(lstm_df['sentiment'].values, dtype=torch.long)
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)
Found 30515 unique tokens. Shape of review tensor: torch.Size([37422, 49]) Shape of sentiment tensor: torch.Size([37422])
review_pad[2000]
tensor([ 603, 225, 104, 1250, 179, 2544, 1540, 6054, 6055, 465, 6056, 14,
4982, 875, 15, 6057, 15, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0])
# Split the dataset into training and validation
# Create indices using torch.arange
indices = torch.arange(sequences.shape[0])
print("Original indices:", indices)
# Apply the shuffled indices to sequences and sentiment tensors
sequences = sequences[indices]
sentiment_tensor = sentiment[indices]
# Calculate the number of validation samples
num_validation_samples = int(VALIDATION_SPLIT * sequences.shape[0])
print("Number of validation samples:", num_validation_samples)
Original indices: tensor([ 0, 1, 2, ..., 37419, 37420, 37421]) Number of validation samples: 7484
# Get the number of samples
num_samples = len(review_pad)
# Generate shuffled indices
indices = np.arange(num_samples)
np.random.seed(42) # Set random seed for reproducibility
np.random.shuffle(indices)
# Apply shuffled indices to split data
X_shuffled = review_pad[indices]
y_shuffled = sentiment[indices]
# Perform split
num_validation_samples = int(0.2 * num_samples) # Assuming 80-20 train-test split
X_train_pad = X_shuffled[:-num_validation_samples]
y_train = y_shuffled[:-num_validation_samples]
X_test_pad = X_shuffled[-num_validation_samples:]
y_test = y_shuffled[-num_validation_samples:]
print('Shape of Training dataset X:', X_train_pad.shape)
print('Shape of Training dataset Y:', y_train.shape)
print('Shape of Validation dataset X:', X_test_pad.shape)
print('Shape of Valodation daatset Y:', y_test.shape)
Shape of Training dataset X: torch.Size([29938, 49]) Shape of Training dataset Y: torch.Size([29938]) Shape of Validation dataset X: torch.Size([7484, 49]) Shape of Valodation daatset Y: torch.Size([7484])
# Define embedding dimensions
EMBEDDING_DIM = 100
# Define model hyperparameters
max_length = 100
# Number of words in vocabulary (+1 for padding index)
num_words = len(word_index) + 1
# Initialize the embedding matrix as a tensor filled with zeros
embedding_matrix = torch.zeros((num_words, EMBEDDING_DIM))
# Populate embedding matrix with pre-trained embeddings
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# Convert embedding_vector to a PyTorch tensor and assign it
embedding_matrix[i] = torch.tensor(embedding_vector, dtype=torch.float32)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Hyperparameters
batch_size = 128
epochs = 15
X_train_tensor = X_train_pad.clone().detach().to(dtype=torch.long)
y_train_tensor = y_train.clone().detach().to(dtype=torch.float32)
X_test_tensor = X_test_pad.clone().detach().to(dtype=torch.long)
y_test_tensor = y_test.clone().detach().to(dtype=torch.float32)
# Create DataLoader for training and validation
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
# Define the Model
class SentimentModel(nn.Module):
def __init__(self, num_words, EMBEDDING_DIM, embedding_matrix, max_length):
super(SentimentModel, self).__init__()
self.embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=True, padding_idx=0)
# Single-layer LSTM with dropout
self.lstm_layer = nn.LSTM(
input_size=EMBEDDING_DIM,
hidden_size=32,
num_layers=1,
batch_first=True,
dropout=0, # Dropout is ignored for single-layer LSTMs
)
# Fully connected layer
self.fc_layer = nn.Linear(32, 1)
# Activation function
self.sigmoid = nn.Sigmoid()
def forward(self, x):
embedded = self.embedding_layer(x)
lstm_out, _ = self.lstm_layer(embedded) # LSTM returns (output, (h_n, c_n))
lstm_out = lstm_out[:, -1, :] # Take last LSTM output
logits = self.fc_layer(lstm_out)
return self.sigmoid(logits).squeeze() # Ensure output is a tensor
# Example: Instantiating the model
model = SentimentModel(num_words, EMBEDDING_DIM, embedding_matrix, max_length)
# Binary Cross Entropy Loss
criterion = nn.BCELoss()
# Adam Optimizer with L2 Regularization
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Print model summary
print(model)
SentimentModel( (embedding_layer): Embedding(30516, 100, padding_idx=0) (lstm_layer): LSTM(100, 32, batch_first=True) (fc_layer): Linear(in_features=32, out_features=1, bias=True) (sigmoid): Sigmoid() )
print("Train...")
for epoch in range(epochs):
model.train()
total_loss = 0
correct = 0
total = 0
for batch in train_loader:
X_batch, y_batch = batch
y_batch = y_batch
# Forward pass
predictions = model(X_batch)
# Compute loss
loss = criterion(predictions, y_batch)
total_loss += loss.item()
# Compute accuracy
predicted_labels = (predictions >= 0.5).float()
correct += (predicted_labels == y_batch).sum().item()
total += y_batch.size(0)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
train_loss = total_loss / len(train_loader)
train_accuracy = correct / total
# Validation step
with torch.no_grad():
model.eval()
val_loss = 0
val_correct = 0
val_total = 0
for batch in test_loader:
X_batch, y_batch = batch
y_batch = y_batch
predictions = model(X_batch)
loss = criterion(predictions, y_batch)
val_loss += loss.item()
predicted_labels = (predictions >= 0.5).float()
val_correct += (predicted_labels == y_batch).sum().item()
val_total += y_batch.size(0)
val_loss /= len(test_loader)
val_accuracy = val_correct / val_total
print(f"Epoch {epoch+1}/{epochs} - loss: {train_loss:.4f} - accuracy: {train_accuracy:.4f} "
f"- val_loss: {val_loss:.4f} - val_accuracy: {val_accuracy:.4f}")
print("Training complete.")
# Save the trained LSTM model
torch.save(model, "sentiment_model_full.pth")
print("Model saved")
Train... Epoch 1/15 - loss: 0.6354 - accuracy: 0.6437 - val_loss: 0.6046 - val_accuracy: 0.6704 Epoch 2/15 - loss: 0.5834 - accuracy: 0.6973 - val_loss: 0.5639 - val_accuracy: 0.7096 Epoch 3/15 - loss: 0.5528 - accuracy: 0.7254 - val_loss: 0.5481 - val_accuracy: 0.7250 Epoch 4/15 - loss: 0.5369 - accuracy: 0.7378 - val_loss: 0.5398 - val_accuracy: 0.7306 Epoch 5/15 - loss: 0.5253 - accuracy: 0.7457 - val_loss: 0.5266 - val_accuracy: 0.7402 Epoch 6/15 - loss: 0.5194 - accuracy: 0.7494 - val_loss: 0.5175 - val_accuracy: 0.7476 Epoch 7/15 - loss: 0.5114 - accuracy: 0.7562 - val_loss: 0.5145 - val_accuracy: 0.7528 Epoch 8/15 - loss: 0.5069 - accuracy: 0.7583 - val_loss: 0.5242 - val_accuracy: 0.7432 Epoch 9/15 - loss: 0.5012 - accuracy: 0.7630 - val_loss: 0.5101 - val_accuracy: 0.7608 Epoch 10/15 - loss: 0.4968 - accuracy: 0.7646 - val_loss: 0.4995 - val_accuracy: 0.7614 Epoch 11/15 - loss: 0.4908 - accuracy: 0.7706 - val_loss: 0.5061 - val_accuracy: 0.7624 Epoch 12/15 - loss: 0.4861 - accuracy: 0.7718 - val_loss: 0.5018 - val_accuracy: 0.7660 Epoch 13/15 - loss: 0.4909 - accuracy: 0.7697 - val_loss: 0.5011 - val_accuracy: 0.7659 Epoch 14/15 - loss: 0.4831 - accuracy: 0.7773 - val_loss: 0.4996 - val_accuracy: 0.7660 Epoch 15/15 - loss: 0.4776 - accuracy: 0.7773 - val_loss: 0.4998 - val_accuracy: 0.7631 Training complete. Model saved
# Ensure the model is in evaluation mode
model.eval()
# Initialize test loss and accuracy
correct = 0
total = 0
test_loss = 0.0
criterion = nn.BCELoss()
print("Testing...")
# Disable gradient computation for evaluation
with torch.no_grad():
for batch_idx, (inputs, labels) in enumerate(test_loader):
outputs = model(inputs)
# Compute loss
loss = criterion(outputs.squeeze(), labels.float())
test_loss += loss.item()
# Compute accuracy
predicted_labels = (outputs.squeeze() >= 0.5).float()
correct += (predicted_labels == labels).sum().item()
total += labels.size(0)
# Print progress bar with loss and accuracy
print(f"\r{batch_idx+1}/{len(test_loader)} [{'=' * (batch_idx % 20)}] "
f"- loss: {test_loss / (batch_idx+1):.4f} - accuracy: {correct/total:.4f}", end='')
# Compute final test loss and accuracy
test_loss /= len(test_loader)
accuracy = correct / total
Testing... 59/59 [==================] - loss: 0.4996 - accuracy: 0.76317
model.eval()
# Initialize lists to store true labels and predictions
true_labels = []
predictions = []
# Disable gradient calculation for evaluation
with torch.no_grad():
for batch in test_loader:
X_batch, y_batch = batch
# Move data to the same device as the model
X_batch = X_batch.to(next(model.parameters()).device)
y_batch = y_batch.numpy() # Convert true labels to NumPy for evaluation
true_labels.extend(y_batch)
# Get model predictions
y_pred = model(X_batch)
predicted_labels = (y_pred >= 0.5).float().cpu().numpy() # Convert to binary labels
predictions.extend(predicted_labels)
# Convert lists to NumPy arrays
true_labels = np.array(true_labels)
predictions = np.array(predictions)
# Generate Confusion Matrix
conf_matrix = confusion_matrix(true_labels, predictions)
# Plot the Confusion Matrix
plt.figure(figsize=(6,5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()
# Generate Classification Report
class_report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print("Classification Report:\n", class_report)
Classification Report:
precision recall f1-score support
Negative 0.75 0.96 0.84 4913
Positive 0.85 0.38 0.52 2571
accuracy 0.76 7484
macro avg 0.80 0.67 0.68 7484
weighted avg 0.78 0.76 0.73 7484
model.eval()
# Get first 5 samples from the test set
first_5_samples = next(iter(test_loader)) # Fetches one batch
X_batch, y_batch = first_5_samples
# Move input data to the same device as the model
X_batch = X_batch.to(next(model.parameters()).device)
# Get predictions
with torch.no_grad():
y_pred = model(X_batch)
# Convert predictions to binary labels
predicted_labels = (y_pred >= 0.5).float().cpu().numpy()
# Convert true labels to numpy array
true_labels = y_batch.cpu().numpy()
print("First 5 Predictions vs True Labels")
for i in range(5):
print(f"Observation {i+1}: Prediction = {predicted_labels[i]}, True Label = {true_labels[i]}")
First 5 Predictions vs True Labels Observation 1: Prediction = 1.0, True Label = 1.0 Observation 2: Prediction = 0.0, True Label = 1.0 Observation 3: Prediction = 0.0, True Label = 0.0 Observation 4: Prediction = 0.0, True Label = 0.0 Observation 5: Prediction = 0.0, True Label = 0.0
Testing¶
import torch
import numpy as np
import gensim
from torch.nn.utils.rnn import pad_sequence
# Load trained Word2Vec embeddings
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('group_project_embedding_word2vec.txt', binary=False)
# Define preprocessing function to convert text into sequences
def text_to_sequence(text, word2vec_model, max_length):
words = text.lower().split()
sequence = [word2vec_model.index_to_key.index(word) if word in word2vec_model.index_to_key else 0 for word in words]
return sequence[:max_length] # Trim to max_length
# Define max_length
max_length = 100 # Adjust based on training
# new reviews
new_reviews = [
"Not to my taste, will skip and watch another movie",
"good movie!"
]
# Convert reviews to sequences
new_sequences = [text_to_sequence(review, word2vec_model, max_length) for review in new_reviews]
# Convert to tensor and pad sequences
new_sequences_tensor = [torch.tensor(seq) for seq in new_sequences]
new_sequences_tensor = pad_sequence(new_sequences_tensor, batch_first=True)
# Load full trained model
model = torch.load("sentiment_model_full.pth", weights_only=False)
model.eval()
# Perform predictions
with torch.no_grad():
predictions = model(new_sequences_tensor)
# Convert predictions to binary labels
predicted_labels = ["Positive" if pred > 0.6 else ("Negative" if pred <= 0.4 else "Neutral")for pred in predictions]
for review, label in zip(new_reviews, predicted_labels):
print(f"Review: {review}\nPredicted Sentiment: {label}\n")
Review: Not to my taste, will skip and watch another movie Predicted Sentiment: Neutral Review: good movie! Predicted Sentiment: Negative
3. Time Series Analysis¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load file
tsl_stk = pd.read_csv("TSLA_stock_data.csv")
tsl_stk['Date'] = pd.to_datetime(tsl_stk['Date'])
tsl_stk.head(5)
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 |
# Check csv
tsl_stk.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 252 non-null datetime64[ns] 1 Open 252 non-null float64 2 High 252 non-null float64 3 Low 252 non-null float64 4 Close 252 non-null float64 5 Adj Close 252 non-null float64 6 Volume 252 non-null int64 dtypes: datetime64[ns](1), float64(5), int64(1) memory usage: 13.9 KB
tsl_twt = pd.read_csv("TSLA_tweets_data.csv")
tsl_twt['Date'] = pd.to_datetime(tsl_twt['Date'])
tsl_twt.head()
| Date | Tweet | |
|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA |
| 3 | 2021-09-30 | I agree with @freshjiva that $TSLA ‘s EV busin... |
| 4 | 2021-09-30 | Playing in the dirt and #chasingsunsets\n@tesl... |
df = tsl_stk.copy()
3.1 Data Processing¶
#sentiment-score
sentiment_score_t = pd.read_csv('Tweet_sentiment_score.csv')
sentiment_score_d = pd.read_csv('Tweet_sentiment_score(every day).csv')
sentiment_score_t.head(3)
| Date | Tweet | sentiment_score | Positive | Neutral | Negative | |
|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | In other words, AMD has been giving Tesla pref... | 0.6590 | 0.166 | 0.834 | 0.0 |
| 1 | 2021-09-30 | Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery... | 0.4215 | 0.257 | 0.743 | 0.0 |
| 2 | 2021-09-30 | Hold. On. Tight. $TSLA | 0.0000 | 0.000 | 1.000 | 0.0 |
sentiment_score_d.head(3)
| Date | sentiment_score | |
|---|---|---|
| 0 | 2021-09-30 | 0.231552 |
| 1 | 2021-10-01 | 0.233704 |
| 2 | 2021-10-02 | 0.271940 |
sentiment_score_d['Date'] = pd.to_datetime(sentiment_score_d['Date'])
sentiment_score_d.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 365 entries, 0 to 364 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 365 non-null datetime64[ns] 1 sentiment_score 365 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 5.8 KB
# The weekend results will be carried forward to the next trading day,
# combining the scores of Saturday and Sunday into Monday
sc = sentiment_score_d.copy()
sc.set_index('Date', inplace=True)
# Calculate the scores for Monday
weekends = sc[sc.index.weekday >= 5]
mondays = sc[sc.index.weekday == 0]
ave_sentiment = weekends.resample('W-Mon').mean()
print(ave_sentiment.head())
mondays = mondays.add(ave_sentiment['sentiment_score'])/2
# Update the sentiment score DataFrame by removing Saturday and Sunday data
sc = sc[sc.index.weekday < 5]
sc.update(mondays)
sc.reset_index(inplace=True)
sc.head(10)
sentiment_score Date 2021-10-04 0.271755 2021-10-11 0.269741 2021-10-18 0.217852 2021-10-25 0.243395 2021-11-01 0.307372
| Date | sentiment_score | |
|---|---|---|
| 0 | 2021-09-30 | 0.231552 |
| 1 | 2021-10-01 | 0.233704 |
| 2 | 2021-10-04 | 0.135388 |
| 3 | 2021-10-05 | 0.069445 |
| 4 | 2021-10-06 | 0.199940 |
| 5 | 2021-10-07 | 0.192548 |
| 6 | 2021-10-08 | 0.220011 |
| 7 | 2021-10-11 | 0.185286 |
| 8 | 2021-10-12 | 0.191255 |
| 9 | 2021-10-13 | 0.160097 |
# Distribution of sentiment scores
plt.figure(figsize=(6,4))
sns.histplot(sc['sentiment_score'], bins=50, kde=True, color='purple')
plt.title("Distribution of Sentiment Scores")
plt.xlabel("Sentiment")
plt.ylabel("Frequency")
plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
plt.show()
plt.figure(figsize=(10,5))
plt.plot(sc['Date'], sc['sentiment_score'], label="Sentiment Score", color='green')
plt.axhline(y=0, color='red', linestyle='--', label="Neutral Sentiment")
plt.title("Sentiment Scores Over Time")
plt.xlabel("Date")
plt.ylabel("Sentiment Score")
plt.legend()
plt.grid()
plt.show()
plt.close('all')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv('stock_yfinance_data.csv')
df_tsla = df[df['Stock Name'] == 'TSLA']
df_tsla = df_tsla[['Open', 'Close', 'High', 'Low', 'Volume']]
corr_matrix = df_tsla.corr()
plt.figure(figsize=(6, 5))
sns.heatmap(
corr_matrix,
annot=True,
cmap='coolwarm',
vmin=-1, vmax=1,
square=True
)
plt.title('Correlation Matrix for TSLA')
plt.show()
# Plot of stock prices and sentiment scores over time
from sklearn.preprocessing import StandardScaler
stk,stt = df.copy(),sc.copy()
# Normalize the data
scaler_close = StandardScaler()
stk['Close_normalized'] = scaler_close.fit_transform(stk[['Close']])
scaler_sentiment = StandardScaler()
stt['normalized'] = scaler_sentiment.fit_transform(stt[['sentiment_score']])
plt.figure(figsize=(18, 6))
plt.plot(stk['Date'], stk['Close_normalized'], label="Close Price", color='blue',linewidth = 1.5)
plt.plot(stt['Date'], stt['normalized'], label="Sentiment Score", color='red',linewidth = 1.5,linestyle = '-',alpha = 0.4)
plt.title("TSLA Stock Prices and Sentiment Score Over Time")
plt.xlabel("Date")
plt.grid(True)
plt.legend()
plt.show()
# Merge into one dataset
merged_data = pd.merge(df, sc, on='Date', how='left') #df stockdata/sc sentimentscore data
merged_data.head()
| Date | Open | High | Low | Close | Adj Close | Volume | sentiment_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 | 0.231552 |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 | 0.233704 |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 | 0.135388 |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 | 0.069445 |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 | 0.199940 |
# Save data
merged_data.to_csv('merged_data.csv',index = False)
3.2 ACF/PACF¶
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Plot autocorrelation and partial autocorrelation functions
def acf(variable):
fig, axes = plt.subplots(1,2,figsize = (13,4))
plot_acf(tsl_stk[variable], lags=40, ax = axes[0])
axes[0].set_title(f'Autocorrelation of {variable}')
plot_pacf(tsl_stk[variable], lags=40, ax = axes[1])
axes[1].set_title(f'Partial Autocorrelation of {variable}')
plt.show()
tsl_stk.columns.values
array(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
dtype=object)
for var in list(tsl_stk.columns.values)[1:]:
acf(var)
3.3 Time Series Prediction¶
# Split the dataset by window
def univariate_data(dataset, start_index, end_index, history_size, target_size):
data = []
labels = []
start_index = start_index + history_size
if end_index is None:
end_index = len(dataset) - target_size
for i in range(start_index, end_index):
indices = range(i - history_size, i)
# Convert dataset to a PyTorch tensor
dataset_tensor = torch.tensor(dataset, dtype=torch.float32)
data.append(dataset_tensor[indices].unsqueeze(-1)) # Adding extra dimension
if target_size == 0:
labels.append(dataset_tensor[i + target_size])
else:
labels.append(dataset_tensor[i : i + target_size])
return torch.stack(data), torch.stack(labels)
3.3.1 Feature Engineering¶
import pandas as pd
#Load dataset
merged_data = pd.read_csv('merged_data.csv')
# Add S&P500 AND Nasadaq Composite Index
index = pd.read_csv('index.csv')
merged_data2 = pd.merge(merged_data,index,on = 'Date',how = 'left')
merged_data2.head()
| Date | Open | High | Low | Close | Adj Close | Volume | sentiment_score | S&P500 | Nasdaq_Index | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 | 0.231552 | 4307.54 | 14689.62 |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 | 0.233704 | 4357.04 | 14791.87 |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 | 0.135388 | 4300.46 | 14472.12 |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 | 0.069445 | 4345.72 | 14674.15 |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 | 0.199940 | 4363.55 | 14766.75 |
Explore data information
df = merged_data2
df['Daily_Return(close)'] = df['Close'].pct_change()
df['Daily_Return(S&P500)'] = df['S&P500'].pct_change()
df['Daily_Return(Nasdaq_Index)'] = df['Nasdaq_Index'].pct_change()
annual_volatility1 = df['Daily_Return(close)'].std() * (252 ** 0.5) * 100
annual_volatility2 = df['Daily_Return(S&P500)'].std() * (252 ** 0.5) * 100
annual_volatility3 = df['Daily_Return(Nasdaq_Index)'].std() * (252 ** 0.5) * 100
print("annual_volatility: ")
print(f"Close:{annual_volatility1:.2f}%")
print(f"S&P500:{annual_volatility2:.2f}%")
print(f"Nasdaq_Index:{annual_volatility3:.2f}%")
annual_volatility: Close:64.46% S&P500:21.82% Nasdaq_Index:29.76%
df['Date'] = pd.to_datetime(df['Date'])
import pandas_ta as pta
import matplotlib.pyplot as plt
from matplotlib.dates import MonthLocator, DateFormatter
plt.close('all')
df.ta.adx(high='High', low='Low', close='Close', length=14, append=True)
df.rename(columns={'ADX_14': 'ADX'}, inplace=True)
df['SMA20'] = df['Close'].rolling(window=20).mean()
df['SMA50'] = df['Close'].rolling(window=50).mean()
df['MA_Divergence'] = abs(df['SMA20'] - df['SMA50']) / df['SMA50'] * 100
plt.figure(figsize=(14, 6))
plt.plot(df['Date'], df['ADX'], label='ADX ',lw = 1.5,color = 'purple')
plt.axhline(y=25, color='r', linestyle='--', alpha=0.6,label = 'Strong Trend Threshold (25)')
plt.axhline(y=20, color='g', linestyle='--', alpha=0.6,label = 'Weak Trend Threshold(20)')
plt.title("TSLA Stock Trend Strength (ADX)")
plt.xlabel("Date")
plt.ylabel("ADX Value")
locator = MonthLocator(interval=1)
formatter = DateFormatter('%Y-%m')
plt.gca().xaxis.set_major_locator(locator)
plt.gca().xaxis.set_major_formatter(formatter)
plt.legend()
plt.grid(True, which='major',alpha = 0.5)
plt.show()
plt.close('all')
add extra feature
import seaborn as sns
import matplotlib.pyplot as plt
# Create correlation matrix
correlation = df[['Close', 'sentiment_score', 'S&P500', 'Nasdaq_Index']].corr()
# Plot using seaborn
plt.figure(figsize=(5,5))
heatmap = sns.heatmap(correlation, annot=True, fmt=".2f", cmap="RdBu_r",
vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title("Correlation Matrix with Improved Features")
plt.tight_layout()
plt.show()
plt.close('all')
df1 = merged_data2.copy()
# Add features related to closing prices and sentiment scores
# Calculate future return lag features, which refer to the rate of change in stock prices at future time points
for lag in [1,3,7,15]: #acf:The autocorrelation is significant for lags ranging from 0 to 15.
df1[f'future_returns_lag_{lag}'] = df1['Close'].shift(-lag) / df1['Close'] - 1
# The volatility and lag characteristics of sentiment scores
df1['sentiment_volatility'] = df1['sentiment_score'].rolling(window=3).std()
df1['lagged_sentiment'] = df1['sentiment_score'].shift(1) #窗口1
# Plot the Correlation Matrix with Improved Features
correlation = df1[['sentiment_score', 'sentiment_volatility', 'lagged_sentiment',
'future_returns_lag_1', 'future_returns_lag_3','future_returns_lag_7', 'future_returns_lag_15','S&P500','Nasdaq_Index']].corr()
'''
import plotly.express as px
fig = px.imshow(
correlation,
text_auto=".2f",
color_continuous_scale=px.colors.sequential.RdBu,
labels=dict(color="Correlation"),
title="Correlation Matrix with Improved Features"
)
fig.show()
'''
# Plot using seaborn
plt.figure(figsize=(8,8))
heatmap = sns.heatmap(correlation, annot=True, fmt=".2f", cmap="RdBu_r",
vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'})
plt.title("Correlation Matrix with Improved Features")
plt.tight_layout()
plt.show()
plt.close('all')
df1.fillna(0,inplace = True)
df1.head()
| Date | Open | High | Low | Close | Adj Close | Volume | sentiment_score | S&P500 | Nasdaq_Index | ... | DMN_14 | SMA20 | SMA50 | MA_Divergence | future_returns_lag_1 | future_returns_lag_3 | future_returns_lag_7 | future_returns_lag_15 | sentiment_volatility | lagged_sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-30 | 260.333344 | 263.043335 | 258.333344 | 258.493347 | 258.493347 | 53868000 | 0.231552 | 4307.54 | 14689.62 | ... | 0.0 | 0.0 | 0.0 | 0.0 | -0.000335 | 0.006589 | 0.021226 | 0.152834 | 0.000000 | 0.000000 |
| 1 | 2021-10-01 | 259.466675 | 260.260010 | 254.529999 | 258.406677 | 258.406677 | 51094200 | 0.233704 | 4357.04 | 14791.87 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.008140 | 0.009713 | 0.039344 | 0.173447 | 0.000000 | 0.231552 |
| 2 | 2021-10-04 | 265.500000 | 268.989990 | 258.706665 | 260.510010 | 260.510010 | 91449900 | 0.135388 | 4300.46 | 14472.12 | ... | 0.0 | 0.0 | 0.0 | 0.0 | -0.001203 | 0.015457 | 0.037810 | 0.311351 | 0.056152 | 0.233704 |
| 3 | 2021-10-05 | 261.600006 | 265.769989 | 258.066681 | 260.196655 | 260.196655 | 55297800 | 0.069445 | 4345.72 | 14674.15 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.002767 | 0.006277 | 0.048335 | 0.304693 | 0.082659 | 0.135388 |
| 4 | 2021-10-06 | 258.733337 | 262.220001 | 257.739990 | 260.916656 | 260.916656 | 43898400 | 0.199940 | 4363.55 | 14766.75 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.013874 | 0.011741 | 0.077011 | 0.325915 | 0.065248 | 0.069445 |
5 rows × 25 columns
3.3.2 Model Design¶
import matplotlib.pyplot as plt
import numpy as np
model_data = df1.copy()
# Chosen features
import torch
import torch.nn as nn
#Taking different features into consideration to show different performance
features_considered = ['Close'
# ,'sentiment_score'#]
# , 'sentiment_volatility'
# , 'lagged_sentiment'
, 'future_returns_lag_1'
, 'future_returns_lag_3'
, 'future_returns_lag_7'
, 'future_returns_lag_15'
#, 'S&P500','Nasdaq_Index'
]
features_considered_sentiment = ['Close','sentiment_score']
features_considered_sentiment_index = ['Close','sentiment_score','S&P500','Nasdaq_Index']
features_considered_close = ['Close']
features = model_data[features_considered] # Feature data
features.index = model_data['Date']
features.head()
| Close | future_returns_lag_1 | future_returns_lag_3 | future_returns_lag_7 | future_returns_lag_15 | |
|---|---|---|---|---|---|
| Date | |||||
| 2021-09-30 | 258.493347 | -0.000335 | 0.006589 | 0.021226 | 0.152834 |
| 2021-10-01 | 258.406677 | 0.008140 | 0.009713 | 0.039344 | 0.173447 |
| 2021-10-04 | 260.510010 | -0.001203 | 0.015457 | 0.037810 | 0.311351 |
| 2021-10-05 | 260.196655 | 0.002767 | 0.006277 | 0.048335 | 0.304693 |
| 2021-10-06 | 260.916656 | 0.013874 | 0.011741 | 0.077011 | 0.325915 |
# Visualization
features.plot(subplots = True,figsize = (7,7))
array([<Axes: xlabel='Date'>, <Axes: xlabel='Date'>,
<Axes: xlabel='Date'>, <Axes: xlabel='Date'>,
<Axes: xlabel='Date'>], dtype=object)
# Convert the selected features to a NumPy array
dataset = features.values
TRAIN_SPLIT = int(len(dataset) * 0.7) # 70% as training dataset
# Calculate the mean and standard deviation of the training split along each feature dimension
data_mean = torch.tensor(dataset[:TRAIN_SPLIT].mean(axis=0), dtype=torch.float32)
data_std = torch.tensor(dataset[:TRAIN_SPLIT].std(axis=0), dtype=torch.float32)
# Normalize the dataset using the calculated mean and standard deviation
dataset = (torch.tensor(dataset, dtype=torch.float32) - data_mean) / data_std
# Generates training samples and labels from a multivariate time series dataset.
def multivariate_data(dataset, target, start_index, end_index, history_size,
target_size, step, single_step=False):
data = []
labels = []
start_index = start_index + history_size
if end_index is None:
end_index = len(dataset) - target_size
for i in range(start_index, end_index):
indices = range(i - history_size, i, step)
dataset_tensor = dataset if isinstance(dataset, torch.Tensor) else torch.tensor(dataset, dtype=torch.float32)
target_tensor = target if isinstance(target, torch.Tensor) else torch.tensor(target, dtype=torch.float32)
# Append the input data (history) from this window
data.append(dataset_tensor[indices])
# Append the target data
if single_step:
labels.append(target_tensor[i + target_size])
else:
labels.append(target_tensor[i:i + target_size])
return torch.stack(data), torch.stack(labels)
# Create training and validation datasets
# Define the history size and target size
FUTURE_STEP = 15
past_history = 35
future_target = FUTURE_STEP
STEP = 1
# Create training and validation datasets for the multi-step model
x_train_multi, y_train_multi = multivariate_data(
dataset, dataset[:, 0], 0, TRAIN_SPLIT, past_history, future_target, STEP
)
x_val_multi, y_val_multi = multivariate_data(
dataset, dataset[:, 0], TRAIN_SPLIT, None, past_history, future_target, STEP
)
print(f"x_train_multi shape: {x_train_multi.shape}, y_train_multi shape: {y_train_multi.shape}")
print(f"x_val_multi shape: {x_val_multi.shape}, y_val_multi shape: {y_val_multi.shape}")
x_train_multi shape: torch.Size([141, 35, 5]), y_train_multi shape: torch.Size([141, 15]) x_val_multi shape: torch.Size([26, 35, 5]), y_val_multi shape: torch.Size([26, 15])
#loader
from torch.utils.data import DataLoader, TensorDataset
# Define the batch size
BATCH_SIZE = 100
# Create TensorDatasets for the training and validation data
train_data_multi = TensorDataset(x_train_multi, y_train_multi)
val_data_multi = TensorDataset(x_val_multi, y_val_multi)
# Create DataLoaders for batching and shuffling
train_data_loader_multi = DataLoader(train_data_multi, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader_multi = DataLoader(val_data_multi, batch_size=BATCH_SIZE, shuffle=False)
dataset
tensor([[-1.1726, -0.0056, 0.0846, 0.1794, 0.8712],
[-1.1745, 0.1909, 0.1263, 0.3360, 0.9891],
[-1.1287, -0.0257, 0.2029, 0.3228, 1.7776],
...,
[-0.6404, 0.4012, -0.0032, -0.0041, -0.0026],
[-0.5343, -1.5768, -0.0032, -0.0041, -0.0026],
[-0.9611, 0.0021, -0.0032, -0.0041, -0.0026]])
3.3.3 Moving Average (Baseline)¶
#split the dataset by window
def univariate_data(dataset, start_index, end_index, history_size, target_size):
data = []
labels = []
start_index = start_index + history_size
if end_index is None:
end_index = len(dataset) - target_size
for i in range(start_index, end_index):
indices = range(i - history_size, i)
# Convert dataset to a PyTorch tensor
dataset_tensor = torch.tensor(dataset, dtype=torch.float32)
data.append(dataset_tensor[indices].unsqueeze(-1)) # Adding extra dimension
if target_size == 0:
labels.append(dataset_tensor[i + target_size])
else:
labels.append(dataset_tensor[i : i + target_size])
return torch.stack(data), torch.stack(labels)
#define baseline testing and training set
uni_data = dataset[:,0]
print('uni_data shape:',uni_data.shape)
HISTORY_SIZE = 35 # Number of past time steps
TARGET_SIZE = 0 # Predict the next step
x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT, HISTORY_SIZE, TARGET_SIZE)
x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None, HISTORY_SIZE, TARGET_SIZE)
print(f"x_train_uni shape: {x_train_uni.shape}")
print(f"y_train_uni shape: {y_train_uni.shape}")
print(f"x_val_uni shape: {x_val_uni.shape}")
print(f"y_val_uni shape: {y_val_uni.shape}")
uni_data shape: torch.Size([252]) x_train_uni shape: torch.Size([141, 35, 1]) y_train_uni shape: torch.Size([141]) x_val_uni shape: torch.Size([41, 35, 1]) y_val_uni shape: torch.Size([41])
C:\Users\aa139\AppData\Local\Temp\ipykernel_55528\4179888868.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
# Function to create time steps for plotting
def create_time_steps(length):
time_steps = []
for i in range(-length, 0, 1):
time_steps.append(i)
return time_steps
def show_plot(plot_data, delta, title):
labels = ['History', 'True Future', 'Model Prediction']
marker = ['.-', 'rx', 'go']
time_steps = create_time_steps(plot_data[0].shape[0])
if delta:
future = list(range(0,delta,1))
else: #single step
future = 0
plt.figure(figsize=(8,5))
plt.title(title)
for i, x in enumerate(plot_data):
if i:#true future data
plt.plot(future, plot_data[i], marker[i], markersize=6, label=labels[i])
else:#history data
plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
plt.legend()
plt.grid(alpha = 0.3)
plt.xlim([time_steps[0] - 1, (future[-1] + 2)])
plt.xlabel('Time-Step')
return plt
One Step Baseline¶
# Baseline function to calculate the mean of the history
def baseline(history):
return torch.mean(history).item()
# Generate a baseline prediction
example_data = x_train_uni[0] # Single example (tensor)
example_label = y_train_uni[0] # Corresponding label (tensor)
# Compute the baseline prediction
baseline_prediction = baseline(example_data)
# Show the plot using the previously defined show_plot function
show_plot(
[example_data.numpy(), example_label.numpy(), np.array(baseline_prediction)],
delta=0,
title="Baseline Prediction"
).show()
def multi_step_baseline(history, window_size, target_size):
predictions = []
current_window = history.squeeze().numpy().copy() # [35]
for _ in range(target_size):
pred = np.mean(current_window[-window_size:])
predictions.append(pred)
current_window = np.append(current_window[1:], pred)
return predictions
# Function to create time steps for plotting
def create_time_steps(length):
time_steps = []
for i in range(-length, 0, 1):
time_steps.append(i)
return time_steps
def show_plot(plot_data, delta, title):
labels = ['History', 'True Future', 'Model Prediction']
marker = ['.-', 'rx', 'go']
time_steps = create_time_steps(plot_data[0].shape[0])
if delta:
future = list(range(0,delta,1))
else: #single step
future = 0
plt.figure(figsize=(8,5))
plt.title(title)
for i, x in enumerate(plot_data):
if i:#true future data
plt.plot(future, plot_data[i], marker[i], markersize=6, label=labels[i])
else:#history data
plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
plt.legend()
plt.grid(alpha = 0.3)
plt.xlim([time_steps[0] - 1, (future[-1] + 2)])
plt.xlabel('Time-Step')
return plt
uni_data = dataset[:,0]
print('uni_data shape:',uni_data.shape)
HISTORY_SIZE = 35 # Number of past time steps
TARGET_SIZE = 15
x_train_uni, y_train_uni = univariate_data(uni_data, 0, TRAIN_SPLIT, HISTORY_SIZE, TARGET_SIZE)
x_val_uni, y_val_uni = univariate_data(uni_data, TRAIN_SPLIT, None, HISTORY_SIZE, TARGET_SIZE)
print(f"x_train_uni shape: {x_train_uni.shape}")
print(f"y_train_uni shape: {y_train_uni.shape}")
print(f"x_val_uni shape: {x_val_uni.shape}")
print(f"y_val_uni shape: {y_val_uni.shape}")
uni_data shape: torch.Size([252]) x_train_uni shape: torch.Size([141, 35, 1]) y_train_uni shape: torch.Size([141, 15]) x_val_uni shape: torch.Size([26, 35, 1]) y_val_uni shape: torch.Size([26, 15])
C:\Users\aa139\AppData\Local\Temp\ipykernel_55528\4179888868.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
Multi Step Baseline¶
# Generate a baseline prediction
example_data = x_train_uni[45] # Single example (tensor)
example_label = y_train_uni[45] # Corresponding label (tensor)
BASELINE_WINDOW_SIZE = 15
# Compute the baseline prediction
baseline_prediction = multi_step_baseline(example_data,BASELINE_WINDOW_SIZE,TARGET_SIZE)
plt.show()
# Show the plot using the previously defined show_plot function
show_plot(
[example_data.numpy(), example_label.numpy(), np.array(baseline_prediction)],
delta= TARGET_SIZE,
title="MA-Baseline Prediction"
).show()
from sklearn.metrics import mean_absolute_error, mean_squared_error
def evaluate_regression(y_true, y_pred):
metrics = {
'MAE': mean_absolute_error(y_true, y_pred),
'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
'MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
}
return metrics
y_true = example_label.numpy()
y_pred = np.array(baseline_prediction)
print(evaluate_regression(y_true, y_pred))
#pd.DataFrame([evaluate_regression(y_true, y_pred)])
{'MAE': 0.257088303565979, 'RMSE': 0.32571183386762115, 'MAPE': 34.09973382949829}
BASELINE_WINDOW_SIZE = 15
mae = []
mse = []
rmse = []
for i in range(x_train_uni.shape[0]):
example_data = x_train_uni[i] # Single example (tensor)
example_label = y_train_uni[i] # Corresponding label (tensor)
# Compute the baseline prediction
baseline_prediction = multi_step_baseline(example_data,BASELINE_WINDOW_SIZE,TARGET_SIZE)
y_true = example_label.numpy()
y_pred = np.array(baseline_prediction)
mae.append(mean_absolute_error(y_true, y_pred))
mse.append(mean_squared_error(y_true, y_pred))
rmse.append(np.sqrt(mean_squared_error(y_true, y_pred)))
print(f'rmse:{np.mean(rmse)} mse:{np.mean(mse)} mae:{np.mean(mae)}')
rmse:0.7912054789702202 mse:0.7531945594183519 mae:0.6948035638383094
3.3.4 LSTM Model¶
#define loss function
def rmae_loss(predictions, targets):
absolute_errors = torch.abs(predictions - targets)
mean_absolute_error = torch.mean(absolute_errors)
root_mean_absolute_error = torch.sqrt(mean_absolute_error)
return root_mean_absolute_error
loss_fn2 = rmae_loss
import torch
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = x_train_multi.shape[2] # Number of input features
hidden_size1 = 64
hidden_size2 = 32
output_size = FUTURE_STEP # Number of future values to predict
# Define a multi-layer LSTM model
class LSTMModel(nn.Module):
def __init__(self):
super(LSTMModel, self).__init__()
self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)
self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, num_layers=2, dropout=0.4, batch_first=True)
self.dense = nn.Linear(hidden_size2, output_size)
def forward(self, x):
lstm_out, _ = self.lstm1(x)
lstm_out, _ = self.lstm2(lstm_out)
output = self.dense(lstm_out[:, -1, :]) # Take the output from the last time step
return output
# Instantiate the model
model = LSTMModel().to(device)
# Define the Optimizer
optimizer = torch.optim.RMSprop(
list(model.lstm1.parameters()) + list(model.lstm2.parameters()) + list(model.dense.parameters()),
lr=0.001, alpha=0.9
)
loss_fn = nn.L1Loss() # Use MAE as loss function
print(f"LSTM1: {model.lstm1}")
print(f"LSTM2: {model.lstm2}")
print(f"Dense: {model.dense}")
LSTM1: LSTM(5, 64, batch_first=True) LSTM2: LSTM(64, 32, num_layers=2, batch_first=True, dropout=0.4) Dense: Linear(in_features=32, out_features=15, bias=True)
# Training model
EPOCHS = 50
EVALUATION_INTERVAL = 100
validation_steps = 50
# Training loop
best_val_loss = 0
patience = 10
no_improve = 0
for epoch in range(EPOCHS):
print(f"Epoch {epoch + 1}/{EPOCHS}")
model.train()
train_loss = 0.0
for step, (x_batch, y_batch) in enumerate(train_data_loader_multi):
x_batch, y_batch = x_batch.to(device), y_batch.to(device)
# Forward pass
predictions = model(x_batch)
# Calculate loss
loss_mae = loss_fn(predictions, y_batch)
loss_rmae = loss_fn2(predictions, y_batch)
loss = loss_mae + loss_rmae
# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Accumulate training loss
train_loss += loss.item()
# Limit steps per epoch
if step + 1 == EVALUATION_INTERVAL:
break
# Average training loss
train_loss /= step + 1
val_loss = 0
model.eval()
with torch.no_grad():
for step, (x_batch, y_batch) in enumerate(val_data_loader_multi):
x_batch, y_batch = x_batch.to(device), y_batch.to(device)
if step + 1 == validation_steps:
break
# Forward pass
predictions = model(x_batch)
# Calculate loss
loss_mae = loss_fn(predictions, y_batch)
loss_rmae = loss_fn2(predictions, y_batch)
loss = loss_mae + loss_rmae
# Accumulate validation loss
val_loss += loss.item()
# Average validation loss
val_loss /= step + 1
if val_loss < best_val_loss:
best_val_loss = val_loss
no_improve = 0
else:
no_improve += 1
if no_improve >= patience:
print("Early stopping")
break
print(f"MAE+RMAE - train_loss: {train_loss:.4f} - val_loss: {val_loss:.4f}")
Epoch 1/50 MAE+RMAE - train_loss: 1.7899 - val_loss: 1.1859 Epoch 2/50 MAE+RMAE - train_loss: 1.7711 - val_loss: 1.1449 Epoch 3/50 MAE+RMAE - train_loss: 1.7751 - val_loss: 1.0828 Epoch 4/50 MAE+RMAE - train_loss: 1.6671 - val_loss: 0.9896 Epoch 5/50 MAE+RMAE - train_loss: 1.6392 - val_loss: 0.9145 Epoch 6/50 MAE+RMAE - train_loss: 1.4807 - val_loss: 0.9812 Epoch 7/50 MAE+RMAE - train_loss: 1.4657 - val_loss: 1.0193 Epoch 8/50 MAE+RMAE - train_loss: 1.4024 - val_loss: 1.0488 Epoch 9/50 MAE+RMAE - train_loss: 1.3466 - val_loss: 1.1033 Epoch 10/50 Early stopping
Model Prediction Plot¶
# Function to create time steps for plotting
import matplotlib.pyplot as plt
import numpy as np
def create_time_steps(length):
return list(range(-length, 0))
# Function for multi-step plotting
def multi_step_plot(history, true_future, prediction, title="Multi-Step Prediction"):
plt.figure(figsize=(8, 4))
num_in = create_time_steps(len(history))
num_out = len(true_future)
# Plot the history
plt.plot(num_in, history[:, 0], label="History") # Assuming index 1 corresponds to the target feature
# Plot the true future
plt.plot(np.arange(num_out), true_future, 'bo', label="True Future")
# Plot the predicted future
if prediction is not None:
plt.plot(np.arange(num_out), prediction, 'ro', label="Predicted Future")
plt.legend(loc="upper left")
plt.title(title)
plt.grid(color = 'gray',linestyle = '--',alpha = 0.7)
plt.ylim(-3,0.5)
plt.show()
model.eval()
LSTM_1: Consider close prices:¶
features_considered = ['Close']
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.4381 Test MSE: 0.1919 Test MAE: 0.3649
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
LSTM_2: Consider close prices and future_return:¶
features_considered = ['Close', 'future_returns_lag_1' , 'future_returns_lag_3' , 'future_returns_lag_7' , 'future_returns_lag_15']
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.5059 Test MSE: 0.2559 Test MAE: 0.4495
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
After adding future_return_lag, the loss increased. LSTM itself can capture correlations in the data. The correlation between future_return and close price is linear and easier to capture, which may lead to data redundancy and increased noise. Therefore, variables related to future_return_lag are not used.
LSTM_3: Consider close prices and sentiment_score:¶
features_considered = ['Close','sentiment_score']
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.2905 Test MSE: 0.0844 Test MAE: 0.2378
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
LSTM_4: Consider sentiment features:¶
features_considered = ['Close' ,'sentiment_score' , 'sentiment_volatility' ]
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.2680 Test MSE: 0.0718 Test MAE: 0.2206
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
LSTM_5: Consider only index:¶
features_considered = ['Close' , 'S&P500','Nasdaq_Index']
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.5512 Test MSE: 0.3038 Test MAE: 0.4635
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")
LSTM_6: Consider all the features:¶
features_considered = ['Close' ,'sentiment_score' , 'sentiment_volatility' , 'S&P500','Nasdaq_Index']
#Testing
model.eval
all_predictions = []
all_targets = []
# Iterate through the validation dataset
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x_batch, y__batch) in enumerate(val_data_loader_multi):
x = x_batch.to(device)
y = y_batch.to(device)
# Perform forward pass
predictions = model(x_batch)
all_predictions.append(predictions.cpu())
all_targets.append(y_batch.cpu())
# Print the shape of predictions
print(predictions.shape)
break
all_predictions = torch.cat(all_predictions, dim=0)
all_targets = torch.cat(all_targets, dim=0)
# Calculating MSE, RMSE, MAE
mse = torch.mean((all_predictions - all_targets) ** 2)
rmse = torch.sqrt(mse)
mae = torch.mean(torch.abs(all_predictions - all_targets))
print(f"Test RMSE: {rmse.item():.4f}")
print(f"Test MSE: {mse.item():.4f}")
print(f"Test MAE: {mae.item():.4f}")
torch.Size([26, 15]) Test RMSE: 0.3022 Test MSE: 0.0913 Test MAE: 0.2491
# Multi Features
with torch.no_grad(): # Disable gradient calculation for evaluation
for i, (x, y) in enumerate(val_data_loader_multi):
x, y = x.to(device), y.to(device)
# Perform forward pass
predictions = model(x) # Use the output of the last time step
for j in range(10):
history = x[j].cpu().numpy() # First sample in the batch
true_future = y[j].cpu().numpy() # Corresponding ground truth
predicted_future = predictions[j].cpu().numpy() # Model's prediction
multi_step_plot(history, true_future, predicted_future, title=f"Sample {j + 1}")